#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.graph_objs as go
import plotly.express as px
#importing dataset
data=pd.read_csv('CustomersChurn.csv')
data
| CLIENTNUM | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | ... | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Attrition_Flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | ... | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 | Existing Customer |
| 1 | 818770008 | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | ... | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 | Existing Customer |
| 2 | 713982108 | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | ... | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 | Existing Customer |
| 3 | 769911858 | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | ... | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 | Existing Customer |
| 4 | 709106358 | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | ... | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 | Existing Customer |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10122 | 772366833 | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | 3 | ... | 3 | 4003.0 | 1851 | 2152.0 | 0.703 | 15476 | 117 | 0.857 | 0.462 | Existing Customer |
| 10123 | 710638233 | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | 4 | ... | 3 | 4277.0 | 2186 | 2091.0 | 0.804 | 8764 | 69 | 0.683 | 0.511 | Attrited Customer |
| 10124 | 716506083 | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | 5 | ... | 4 | 5409.0 | 0 | 5409.0 | 0.819 | 10291 | 60 | 0.818 | 0.000 | Attrited Customer |
| 10125 | 717406983 | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | 4 | ... | 3 | 5281.0 | 0 | 5281.0 | 0.535 | 8395 | 62 | 0.722 | 0.000 | Attrited Customer |
| 10126 | 714337233 | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | 6 | ... | 4 | 10388.0 | 1961 | 8427.0 | 0.703 | 10294 | 61 | 0.649 | 0.189 | Attrited Customer |
10127 rows × 21 columns
#missing values
data.isnull().sum()
CLIENTNUM 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 Attrition_Flag 0 dtype: int64
#Duplicates
data.duplicated().sum()
0
gender_counts = data['Gender'].value_counts()
fig = px.pie(
names=gender_counts.index,
values=gender_counts.values,
color_discrete_sequence=['skyblue', 'green'],
labels=['Male', 'Female'],
width=800,
height=350,
)
fig.update_layout(title_text='Distribution of Gender', title_x=0.48,
title_y=0.95
)
# Update layout
fig.update_layout(
margin=dict(t=50, b=50, l=60, r=60 ))
fig.layout.template = 'plotly_dark'
# Show the plot
fig.show()
#Proportion of Attrited vs Existing Customer
# Create the pie chart using Plotly Express
# Calculate the value counts of Attrition_Flag
attrition_counts = data['Attrition_Flag'].value_counts()
# Create the pie chart using Plotly Express
fig = px.pie(
names=attrition_counts.index,
values=attrition_counts.values,
color_discrete_sequence=['skyblue', 'yellow'],
labels=['Attrited Customer', 'Existing Customer'],
width=800,
height=350
)
fig.update_layout(title_text='Proportion of Attrited vs Existing Customer', title_x=0.44, title_y=0.9)
# Update layout
fig.update_layout(
margin=dict(t=50, b=50, l=60, r=60))
fig.layout.template = 'plotly_dark'
# Show the plot
fig.show()
#Proportion of Attrited Customers by Marital status
# Count occurrences of Attrition for each Education_Level
counts = data.groupby(['Marital_Status', 'Attrition_Flag']).size().unstack(fill_value=0)
# Create the bar chart using Plotly Express
fig = px.bar(counts, x=counts.index, y=counts.columns,
color_discrete_map={'Attrited Customer': 'Gold', 'Existing Customer': 'lightyellow'},
width=800, height=500, text_auto='.2s')
# Update layout
fig.update_layout(
margin=dict(t=50, b=50, l=60, r=60),
)
fig.update_layout(title_text='Proportion of Attrited Customers by Marital status', title_x=0.44, title_y=0.95)
fig.update_traces(textfont_size=16, textangle=0, textposition="outside", cliponaxis=True)
fig.update_xaxes(visible=True, title='Education Level')
fig.update_yaxes(visible=True, title='Values')
fig.layout.template = 'plotly_dark'
# Show the plot
fig.show()
#Proportion of Attrited Customers by Education Level
# Count occurrences of Attrition for each Education_Level
counts = data.groupby(['Education_Level', 'Attrition_Flag']).size().unstack(fill_value=0)
# Create the bar chart using Plotly Express
fig = px.bar(counts, x=counts.index, y=counts.columns,
color_discrete_map={'Attrited Customer': 'gold', 'Existing Customer': 'lightskyblue'},
width=800, height=500, text_auto='.2s')
# Update layout
fig.update_layout(
margin=dict(t=50, b=50, l=60, r=60),
)
fig.update_layout(title_text='Proportion of Attrited Customers by Education Level', title_x=0.44, title_y=0.95)
fig.update_traces(textfont_size=16, textangle=0, textposition="outside", cliponaxis=True)
fig.update_xaxes(visible=True, title='Education Level')
fig.update_yaxes(visible=True, title='Values')
fig.layout.template = 'plotly_dark'
# Show the plot
fig.show()
#Proportion of Attrited Customers by Income_Category
# Assuming you have already calculated counts DataFrame using groupby
counts = data.groupby(['Income_Category', 'Attrition_Flag']).size().unstack(fill_value=0)
# Create the bar chart using Plotly Express
fig = px.bar(counts, x=counts.index, y=counts.columns,
color_discrete_map={'Attrited Customer': 'darkblue', 'Existing Customer': 'skyblue'},
width=800, height=500, text_auto='.2s')
# Update layout
fig.update_layout(
margin=dict(t=50, b=50, l=60, r=60),
)
fig.update_layout(title_text='Proportion of Attrited Customers by Income_Category', title_x=0.44, title_y=0.95)
fig.update_traces(textfont_size=16, textangle=0, textposition="outside", cliponaxis=True)
fig.update_xaxes(visible=True, title='Income Category')
fig.update_yaxes(visible=True, title='Values')
fig.layout.template = 'plotly_dark'
# Show the plot
fig.show()
#Proportion by Educational level
# Calculate value counts of Education_Level column
education_counts = data.Education_Level.value_counts()
# Create the pie chart using Plotly Express
fig = px.pie(
names=education_counts.index,
values=education_counts.values,
color_discrete_sequence=['skyblue', 'yellow'],
labels=['Attrited Customer', 'Existing Customer'],
width=800,
height=350
)
fig.update_layout(title_text='Proportion by Educational level', title_x=0.44, title_y=0.95)
# Update layout
fig.update_layout(
margin=dict(t=50, b=50, l=60, r=60))
fig.layout.template = 'plotly_dark'
# Show the plot
fig.show()
##Propotion Of Different Card Categories
Card_Category_counts = data.Card_Category.value_counts()
fig = px.pie(
names=Card_Category_counts.index,
values=Card_Category_counts.values,
color_discrete_sequence=['skyblue', 'yellow','gold','red'],
labels=['Attrited Customer', 'Existing Customer'],
width=800,
height=350
)
fig.update_layout(title_text='Propotion Of Different Card Categories', title_x=0.44, title_y=0.95)
# Update layout
fig.update_layout(
margin=dict(t=50, b=50, l=60, r=60))
fig.layout.template = 'plotly_dark'
# Show the plot
fig.show()
#Propotion Of Credit limit by Income category
# Create the bar chart using Plotly Express
fig=px.bar(data, x='Income_Category',y='Credit_Limit',width=800, height=500, text_auto='.2s',
color_discrete_sequence=['red']
)
# Update layout
fig.update_layout(
margin=dict(t=50, b=50, l=60, r=60), template = 'plotly_dark',
title_text='Propotion Of Credit limit by Income category',
title_x=0.44, title_y=0.95
)
fig.update_traces(textfont_size=16, textangle=0, textposition="outside", cliponaxis=True)
fig.update_xaxes(visible=True, title='Income Category')
fig.update_yaxes(visible=True, title='Credit Limit')
# Show the plot
fig.show()
#converting string to numerical value
data['Attrition_Flag']=data['Attrition_Flag'].map({'Existing Customer':0,'Attrited Customer':1})
data.head()
| CLIENTNUM | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | ... | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Attrition_Flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | ... | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 0 |
| 1 | 818770008 | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | ... | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 0 |
| 2 | 713982108 | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | ... | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 0 |
| 3 | 769911858 | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | ... | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 0 |
| 4 | 709106358 | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | ... | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 | 0 |
5 rows × 21 columns
#dont need this colunm
X = data.drop(columns=['Gender','Education_Level','Income_Category','Marital_Status','Card_Category'], inplace=True)
y = data['Attrition_Flag']
data_array = np.array(data)
data
| CLIENTNUM | Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Attrition_Flag | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | 45 | 3 | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 0 |
| 1 | 818770008 | 49 | 5 | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 0 |
| 2 | 713982108 | 51 | 3 | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 0 |
| 3 | 769911858 | 40 | 4 | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 0 |
| 4 | 709106358 | 40 | 3 | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10122 | 772366833 | 50 | 2 | 40 | 3 | 2 | 3 | 4003.0 | 1851 | 2152.0 | 0.703 | 15476 | 117 | 0.857 | 0.462 | 0 |
| 10123 | 710638233 | 41 | 2 | 25 | 4 | 2 | 3 | 4277.0 | 2186 | 2091.0 | 0.804 | 8764 | 69 | 0.683 | 0.511 | 1 |
| 10124 | 716506083 | 44 | 1 | 36 | 5 | 3 | 4 | 5409.0 | 0 | 5409.0 | 0.819 | 10291 | 60 | 0.818 | 0.000 | 1 |
| 10125 | 717406983 | 30 | 2 | 36 | 4 | 3 | 3 | 5281.0 | 0 | 5281.0 | 0.535 | 8395 | 62 | 0.722 | 0.000 | 1 |
| 10126 | 714337233 | 43 | 2 | 25 | 6 | 2 | 4 | 10388.0 | 1961 | 8427.0 | 0.703 | 10294 | 61 | 0.649 | 0.189 | 1 |
10127 rows × 16 columns
X = data.iloc[:, : -1]
y = data.iloc[:, -1]
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()
X = oe.fit_transform(X)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)
#Features Scalling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train)
print(X_test)
[[0.67397531 0.47727273 0.4 ... 0.37096774 0.26055489 0.07276507] [0.75140741 0.63636364 0.2 ... 0.75 0.47527141 0.26611227] [0.60918519 0.34090909 0.8 ... 0.54032258 0.75271411 0.29209979] ... [0.5602963 0.77272727 0.2 ... 0.45967742 0.67792521 0.17047817] [0.56967901 0.56818182 0.6 ... 0.87903226 0.53920386 0.04158004] [0.71812346 0.61363636 0.2 ... 0.21774194 0.43667069 0.84199584]] [[0.96958025 0.40909091 1. ... 0.39516129 0.66827503 0.1008316 ] [0.36661728 0.40909091 0.6 ... 0.40322581 0.28829916 0. ] [0.85264198 0.25 0.6 ... 0.45967742 0.24728589 0.78690229] ... [0.67930864 0.75 0.2 ... 0.65322581 0.58504222 0.33575884] [0.67595062 0.40909091 0.2 ... 0.47580645 0.31966224 0.07276507] [0.20493827 0.27272727 0. ... 0.21774194 0.23763571 0.13513514]]
#applying model on the training set
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score,KFold
from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
#to fill the missing value
imputer= SimpleImputer(strategy='mean')
# Fit the model on the training data
imputer.fit(X_train, y_train)
SimpleImputer()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
SimpleImputer()
clf=RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
clf.fit(X_train, y_train)
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
#predicting result
pred=(clf.predict(scaler.transform([[768805383,45,3,39,5,1, 3, 12691, 777,11914, 1.335,1144,42,1.625,0.061]])))
results= 'This customer will not churn' if pred == 0 else "This customer will churn"
print(results)
This customer will not churn
pred=(clf.predict(scaler.transform([[71174388,43,3,35,5,2,3,4026,0,4026,0.483,1237,32,0.6,0]])))
results= 'This customer will not churn' if pred == 0 else "This customer will churn"
print(results)
This customer will churn
pred=(clf.predict(scaler.transform([[711791583,52,1,40,1,2,2,2317,0,2317,1.005,884,19,0.727,0]])))
results= 'This customer will not churn' if pred == 0 else "This customer will churn"
print(results)
This customer will churn
y_pred = clf.predict(X_test)
y_pred
array([0, 1, 0, ..., 0, 0, 0], dtype=int64)
#Confusion Matrix
confusionMatrix = confusion_matrix(y_test, y_pred)
#Create ConfusionMatrixDisplay object
ConfusionMatrixDisplay.from_estimator(clf,X_test,y_test)
print(confusionMatrix)
accuracy_score(y_test, y_pred)
[[2554 29] [ 118 338]]
0.9516288252714709
print (classification_report(y_pred, y_test))
precision recall f1-score support
0 0.99 0.96 0.97 2672
1 0.74 0.92 0.82 367
accuracy 0.95 3039
macro avg 0.87 0.94 0.90 3039
weighted avg 0.96 0.95 0.95 3039
#applying SVC moddel
svc=SVC(kernel='rbf',random_state=0)
svc.fit(X_train, y_train)
pred_svc=svc.predict(X_test)
#Confusion Matrix
confusionMatrix = confusion_matrix(y_test, pred_svc)
#Create ConfusionMatrixDisplay object
ConfusionMatrixDisplay.from_estimator(svc,X_test,y_test)
print(confusionMatrix)
accuracy_score(y_test, pred_svc)
[[ 322 134] [ 60 2523]]
0.9361632115827575
print (classification_report(pred_svc, y_test))
precision recall f1-score support
0 0.98 0.95 0.96 2657
1 0.71 0.84 0.77 382
accuracy 0.94 3039
macro avg 0.84 0.90 0.87 3039
weighted avg 0.94 0.94 0.94 3039
#applying Adaboostclassifier moddel
abc=AdaBoostClassifier()
abc.fit(X_train, y_train)
pred_abc=abc.predict(X_test)
#Confusion Matrix
confusionMatrix = confusion_matrix(y_test, pred_abc)
#Create ConfusionMatrixDisplay object
ConfusionMatrixDisplay.from_estimator(abc,X_test,y_test)
print(confusionMatrix)
accuracy_score(y_test, pred_abc)
[[ 378 78] [ 51 2532]]
0.9575518262586377
print (classification_report(pred_abc, y_test))
precision recall f1-score support
0 0.98 0.97 0.98 2610
1 0.83 0.88 0.85 429
accuracy 0.96 3039
macro avg 0.90 0.93 0.91 3039
weighted avg 0.96 0.96 0.96 3039
clf=RandomForestClassifier(random_state=42)
abc=AdaBoostClassifier(random_state=42,learning_rate=0.7)
svc=SVC(random_state=42,kernel='rbf')
f1_cross_val_scores = cross_val_score(clf,X_train, y_train,cv=5,scoring='f1')
ada_f1_cross_val_scores=cross_val_score(abc,X_train,y_train,cv=5,scoring='f1')
svm_f1_cross_val_scores=cross_val_score(svc, X_train,y_train,cv=5,scoring='f1')
fig = make_subplots(rows=3, cols=1,shared_xaxes=True,subplot_titles=('Random Forest Cross Val Scores',
'Adaboost Cross Val Scores',
'SVM Cross Val Scores'))
fig.add_trace(
go.Scatter(x=list(range(0,len(f1_cross_val_scores))),y=f1_cross_val_scores,name='Random Forest'),
row=1, col=1
)
fig.add_trace(
go.Scatter(x=list(range(0,len(ada_f1_cross_val_scores))),y=ada_f1_cross_val_scores,name='Adaboost'),
row=2, col=1
)
fig.add_trace(
go.Scatter(x=list(range(0,len(svm_f1_cross_val_scores))),y=svm_f1_cross_val_scores,name='SVM'),
row=3, col=1
)
fig.update_layout(height=700, width=900, title_text="Different Model 5 Fold Cross Validation")
fig.update_yaxes(visible=True,title_text="F1 Score")
fig.update_xaxes(visible=True,title_text="Fold")
fig.show()